from sklearn.feature_extraction.text import CountVectorizer

# Corpus of single-character Chinese "words", pre-split by spaces so a
# simple regex token pattern can pick them up.
X = ['风 格 宋 丹 丹', '孙 丹 丹', '王 丹 丹']
# NOTE: the original pattern '[a-zA-Z|\u4e00-\u9fa5]+' placed '|' inside the
# character class, where it matches a literal pipe (not alternation) — the
# stray '|' is dropped here. Raw string per regex convention.
count = CountVectorizer(token_pattern=r'[a-zA-Z\u4e00-\u9fa5]+')
x = count.fit_transform(X)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
print(count.get_feature_names_out())
# .A is a deprecated scipy.sparse alias for .toarray(); the original printed
# the identical dense document-term matrix twice — print it once.
print(x.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer

# norm=None keeps raw tf-idf values (no L2 row normalization), so the entries
# can be checked directly against a hand-computed idf.
# Same regex fix as above: '|' inside a character class is a literal pipe,
# so it is removed from the token pattern.
ti = TfidfVectorizer(norm=None, token_pattern=r'[a-zA-Z\u4e00-\u9fa5]+')
x = ti.fit_transform(X)
# get_feature_names() was removed in scikit-learn 1.2.
print(ti.get_feature_names_out())
# Dense tf-idf matrix; .A is a deprecated alias for .toarray().
print(x.toarray())

# Manually reproduce scikit-learn's smoothed idf (smooth_idf=True, the
# default) for a term that appears in exactly one of the 3 documents:
#   idf = ln((1 + n_docs) / (1 + df)) + 1 = ln((1 + 3) / (1 + 1)) + 1
import numpy as np

idf = np.log(4 / 2) + 1
print(idf)